package org.webmagic.xml;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;

import org.apache.log4j.xml.DOMConfigurator;
import org.webmagic.xml.bean.Spider;

import us.codecraft.webmagic.model.xml.Xml2Models;

/**
 * Command-line entry point for the XML-configured spider.
 *
 * <p>Two invocations are supported:
 * <ul>
 *   <li>{@code --create <file>} writes a sample spider configuration to {@code ./<file>};</li>
 *   <li>{@code --run <file>} configures log4j from {@code log4j.xml}, parses {@code <file>}
 *       into a {@link Spider} via {@link Xml2Models#parse} and runs it.</li>
 * </ul>
 * Any other argument list prints a short usage message.
 */
public class Crawler {

    /** Option that writes the sample configuration file. */
    private static final String OPTION_CREATE = "--create";

    /** Option that parses a configuration file and runs the spider it describes. */
    private static final String OPTION_RUN = "--run";

    /**
     * Dispatches on the first command-line argument.
     *
     * @param args expected to be exactly {@code {option, configFile}}; anything else
     *             (including an unrecognised option) prints the usage text
     * @throws IOException if the sample configuration cannot be written
     */
    public static void main(String[] args) throws IOException {
        if (args == null || args.length != 2) {
            printUsage();
            return;
        }

        String option = args[0];
        String configName = args[1];

        if (OPTION_CREATE.equals(option)) {
            writeTemplate(new File("./" + configName));
        } else if (OPTION_RUN.equals(option)) {
            DOMConfigurator.configure("log4j.xml");
            Spider spider = Xml2Models.parse(configName, Spider.class);
            spider.run();
        } else {
            // Previously an unknown option exited silently; tell the user what is accepted.
            printUsage();
        }
    }

    /** Prints the two supported command lines to standard output. */
    private static void printUsage() {
        System.out.println("you can create a new spider config like this:");
        System.out.println("java -jar webmagic-xml.jar --create config.xml");
        System.out.println("you can run a spider like this:");
        System.out.println("java -jar webmagic-xml.jar --run config.xml");
    }

    /**
     * Writes the sample configuration to {@code target}, encoded as UTF-8.
     *
     * @param target file to create or overwrite
     * @throws IOException if the file cannot be opened or written
     */
    private static void writeTemplate(File target) throws IOException {
        // try-with-resources closes the stream even when write/flush throws.
        try (FileOutputStream out = new FileOutputStream(target)) {
            out.write(buildTemplate().getBytes("UTF-8"));
            out.flush();
        }
    }

    /**
     * Builds the text of the sample spider configuration.
     *
     * @return the complete XML document, using CRLF line endings
     */
    private static String buildTemplate() {
        StringBuilder xml = new StringBuilder();
        xml.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n");
        xml.append("<spider>\r\n");
        xml.append(" <site>\r\n");
        xml.append("     <!-- 模板标识 -->\r\n");
        xml.append("     <domain>iqiyi</domain>\r\n");
        xml.append("     <!-- 浏览器标识 -->\r\n");
        xml.append("     <userAgent>\r\n");
        xml.append("       <![CDATA[\r\n");
        xml.append("           Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36 QIHU 360SE\r\n");
        xml.append("         ]]>\r\n");
        xml.append("     </userAgent>\r\n");
        xml.append("     <!-- 页面编码 -->\r\n");
        xml.append("     <charset>UTF-8</charset>\r\n");
        xml.append("     <!-- 休眠时间 -->\r\n");
        xml.append("     <sleepTime>200</sleepTime>\r\n");
        xml.append("     <!-- 立即重试次数 -->\r\n");
        xml.append("     <retryTimes>1</retryTimes>\r\n");
        xml.append("     <!-- 队尾重试次数 -->\r\n");
        xml.append("     <cycleRetryTimes>1</cycleRetryTimes>\r\n");
        xml.append("     <!-- 队尾重试间隔 -->\r\n");
        xml.append("     <retrySleepTime>200</retrySleepTime>\r\n");
        xml.append("     <!-- 超时时间 -->\r\n");
        xml.append("     <timeout>5000</timeout>\r\n");
        xml.append("     <!-- 是否Gzip -->\r\n");
        xml.append("     <useGzip>true</useGzip>\r\n");
        xml.append("     <!-- 不自动管理cookie -->\r\n");
        xml.append("     <disableCookieManagement>false</disableCookieManagement>\r\n");
        xml.append("     <!-- 线程数 -->\r\n");
        xml.append("     <threadSize>10</threadSize>\r\n");
        xml.append("     <!-- 是否重置队列 -->\r\n");
        xml.append("     <resetQueue>false</resetQueue>\r\n");
        xml.append("     <!-- 是否启用优先级设置 -->\r\n");
        xml.append("     <usePriority>false</usePriority>\r\n");
        xml.append("     <!-- 是否使用redis -->\r\n");
        xml.append("     <useRedis>false</useRedis>\r\n");
        xml.append("     <!-- 是否使用数据库 -->\r\n");
        xml.append("     <useDb>false</useDb>\r\n");
        xml.append("     <!-- 是否使用Phantomjs浏览器渲染网页 -->\r\n");
        xml.append("     <usePhantomjs>false</usePhantomjs>\r\n");
        xml.append("     <!-- 起始爬取地址,多个值\",\"分隔-->\r\n");
        xml.append("     <url>\r\n");
        xml.append("            <![CDATA[\r\n");
        xml.append("                http://www.iqiyi.com/lib/s_215641805.html , http://www.iqiyi.com/lib/s_234967205.html\r\n");
        xml.append("            ]]>\r\n");
        xml.append("     </url>\r\n");
        xml.append("     <!-- 成功的消息返回码 -->\r\n");
        xml.append("     <acceptStatCode>200</acceptStatCode>\r\n");
        xml.append("\r\n");
        xml.append("     <!-- 字符串格式 -->\r\n");
        xml.append("     <!-- <cookie> \r\n");
        xml.append("         <![CDATA[ \r\n");
        xml.append("         __guid=26936763.670147605910933600.1510325694223.3628; \r\n");
        xml.append("         monitor_count=1; \r\n");
        xml.append("         __utma=55973678.1155684392.1510325696.1510325696.1510325696.1; \r\n");
        xml.append("         __utmc=55973678; \r\n");
        xml.append("         __utmz=55973678.1510325696.1.1.utmccn=(organic)|utmcsr=baidu|utmctr=|utmcmd=organic \r\n");
        xml.append("         ]]> \r\n");
        xml.append("     </cookie>  -->\r\n");
        xml.append("     <!-- 消息头 -->\r\n");
        xml.append("     <!-- <header> \r\n");
        xml.append("         <![CDATA[ \r\n");
        xml.append("         Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8 \r\n");
        xml.append("         Accept-Encoding:gzip, deflate, sdch Accept-Language:zh-CN,zh;q=0.8 Cache-Control:max-age=0 \r\n");
        xml.append("         Connection:keep-alive Cookie:__guid=26936763.670147605910933600.1510325694223.3628; monitor_count=1; __utma=55973678.1155684392.1510325696.1510325696.1510325696.1; __utmc=55973678; __utmz=55973678.1510325696.1.1.utmccn=(organic)|utmcsr=baidu|utmctr=|utmcmd=organic \r\n");
        xml.append("         Host:www.tutorialspoint.com Upgrade-Insecure-Requests:1\r\n");
        xml.append("         User-Agent:Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36 \r\n");
        xml.append("         ]]> \r\n");
        xml.append("     </header> -->  \r\n");
        xml.append("\r\n");
        xml.append("     <!-- 对象格式 -->\r\n");
        xml.append("     <!-- <cookies> \r\n");
        xml.append("         <cookie key=\"a\">1</cookie> \r\n");
        xml.append("         <cookie key=\"c\">2</cookie> \r\n");
        xml.append("         <cookie key=\"c\">3</cookie> \r\n");
        xml.append("     </cookies> \r\n");
        xml.append("     <headers> \r\n");
        xml.append("         <header key=\"e\">4</header> \r\n");
        xml.append("         <header key=\"f\">5</header> \r\n");
        xml.append("         <header key=\"g\">6</header> \r\n");
        xml.append("     </headers> \r\n");
        xml.append("     \r\n");
        xml.append("     <urls> \r\n");
        xml.append("         <url>f</url>           \r\n");
        xml.append("         <url>e</url> \r\n");
        xml.append("         <url>v</url> \r\n");
        xml.append("     </urls>\r\n");
        xml.append("     <acceptStatCodes> \r\n");
        xml.append("         <acceptStatCode>200</acceptStatCode> \r\n");
        xml.append("     </acceptStatCodes> -->\r\n");
        xml.append("     <!-- post请求头 -->\r\n");
        xml.append("     <!-- <requests> \r\n");
        xml.append("         <request> \r\n");
        xml.append("             <url>aa</url> \r\n");
        xml.append("             <method>post</method> \r\n");
        xml.append("             <requestBody>sss</requestBody> \r\n");
        xml.append("             <priority>11</priority> \r\n");
        xml.append("             <cookie> \r\n");
        xml.append("                 <![CDATA[ \r\n");
        xml.append("                 __guid=26936763.670147605910933600.1510325694223.3628; \r\n");
        xml.append("                 monitor_count=1; \r\n");
        xml.append("                 __utma=55973678.1155684392.1510325696.1510325696.1510325696.1; \r\n");
        xml.append("                 __utmc=55973678; \r\n");
        xml.append("                 __utmz=55973678.1510325696.1.1.utmccn=(organic)|utmcsr=baidu|utmctr=|utmcmd=organic \r\n");
        xml.append("                 ]]> \r\n");
        xml.append("             </cookie>\r\n");
        xml.append("             <header> \r\n");
        xml.append("                 <![CDATA[ \r\n");
        xml.append("                 Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8 \r\n");
        xml.append("                 Accept-Encoding:gzip, deflate, sdch Accept-Language:zh-CN,zh;q=0.8 Cache-Control:max-age=0 \r\n");
        xml.append("                 Connection:keep-alive Cookie:__guid=26936763.670147605910933600.1510325694223.3628; monitor_count=1; __utma=55973678.1155684392.1510325696.1510325696.1510325696.1; __utmc=55973678; __utmz=55973678.1510325696.1.1.utmccn=(organic)|utmcsr=baidu|utmctr=|utmcmd=organic \r\n");
        xml.append("                 Host:www.tutorialspoint.com Upgrade-Insecure-Requests:1 User-Agent:Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36 \r\n");
        xml.append("                 ]]> \r\n");
        xml.append("             </header> \r\n");
        xml.append("             <cookies> \r\n");
        xml.append("                 <cookie key=\"a\">1</cookie> \r\n");
        xml.append("                 <cookie key=\"c\">2</cookie>\r\n");
        xml.append("                 <cookie key=\"c\">3</cookie> \r\n");
        xml.append("             </cookies> \r\n");
        xml.append("             <headers> \r\n");
        xml.append("                 <header key=\"e\">4</header>\r\n");
        xml.append("                 <header key=\"f\">5</header>\r\n");
        xml.append("                 <header key=\"g\">6</header> \r\n");
        xml.append("             </headers> \r\n");
        xml.append("         </request> \r\n");
        xml.append("     </requests> -->\r\n");
        xml.append(" </site>\r\n");
        xml.append(" <!-- redis -->\r\n");
        xml.append(" <redis>\r\n");
        xml.append("     <host>127.0.0.1</host>\r\n");
        xml.append("     <port>6379</port>\r\n");
        xml.append("     <password></password>\r\n");
        xml.append("     <timeout></timeout>\r\n");
        xml.append("     <dbIndex></dbIndex>\r\n");
        xml.append(" </redis>\r\n");
        xml.append(" <!-- mysql,oracle,pgsql,sqlserver,sqlite3,ansi -->\r\n");
        xml.append(" <databases>\r\n");
        xml.append("     <database name=\"mysqldb\" type=\"mysql\">\r\n");
        xml.append("         <!-- 数据库链接 -->\r\n");
        xml.append("         <url>\r\n");
        xml.append("           <![CDATA[\r\n");
        xml.append("               jdbc:mysql://192.168.30.253:3306/lcs_launcher?characterEncoding=utf-8&autoReconnect=true&failOverReadOnly=false&allowMultiQueries=true\r\n");
        xml.append("           ]]>\r\n");
        xml.append("         </url>\r\n");
        xml.append("         <!-- 连接数据库的用户名 -->\r\n");
        xml.append("         <userName>root</userName>\r\n");
        xml.append("         <!-- 连接数据库的密码 -->\r\n");
        xml.append("         <password>root</password>\r\n");
        xml.append("         <!-- 数据库驱动名（这一项可配可不配） -->\r\n");
        xml.append("         <!--<driver>com.mysql.jdbc.Driver</driver> -->\r\n");
        xml.append("         <!-- 初始化时建立物理连接的个数 -->\r\n");
        xml.append("         <initialSize>2</initialSize>\r\n");
        xml.append("         <!-- 最小连接池数量 -->\r\n");
        xml.append("         <minIdle>1</minIdle>\r\n");
        xml.append("         <!-- 最大连接池数量 -->\r\n");
        xml.append("         <maxActive>5</maxActive>\r\n");
        xml.append("     </database>\r\n");
        xml.append(" </databases>\r\n");
        xml.append(" <!-- 结果处理 -->\r\n");
        xml.append(" <pipeline>\r\n");
        xml.append("     <![CDATA[\r\n");
        xml.append("         import org.webmagic.*;\r\n");
        xml.append("         import com.jfinal.plugin.activerecord.*;\r\n");
        xml.append("\r\n");
        xml.append("         def pipeline(bean,task){\r\n");
        xml.append("             println(bean);\r\n");
        xml.append("             println(bean.iqiyisub.name);\r\n");
        xml.append("         }\r\n");
        xml.append("      ]]>\r\n");
        xml.append(" </pipeline>\r\n");
        xml.append(" <!-- 抽取模板 -->\r\n");
        xml.append(" <models>\r\n");
        xml.append("     <model>\r\n");
        xml.append("         <!-- name=\"name\"(模板名称) leaf=\"false\"(是否子模板) -->\r\n");
        xml.append("         <bean name=\"IqiyiPage\">\r\n");
        xml.append("             <!-- name=\"name\"(字段名称) type=\"string\"(字段类型:byte,char/character,short,int/integer,long,double,float,str/string,boolean,list,set,object) leafid=\"leadid\"(子模板name) foundflag=\"false\"(作为url继续爬取) transmitflag=\"false\"(值往后传递) saveflag=\"true\"(抽取值是否保存到结果) -->\r\n");
        xml.append("             <field name=\"name\">\r\n");
        xml.append("                 <!-- expression=\"//hi\"(表达式) type=\"mixe\"(表达式类型:mixe,css,regex,xpath,json,filter,replace,split) notNull=\"false\"(结果是否允许不为空) multi=\"false\"(list抽取) -->\r\n");
        xml.append("                 <extract expression=\"xpath('//h1[@itemprop='name']/text()')\" />\r\n");
        xml.append("             </field>\r\n");
        xml.append("             <field name=\"names\">\r\n");
        xml.append("                 <extract expression=\"xpath('//h1[@itemprop='name']/text()')\"\r\n");
        xml.append("                     notnull=\"false\" />\r\n");
        xml.append("             </field>\r\n");
        xml.append("\r\n");
        xml.append("             <field name=\"iqiyisub\" type=\"object\" leafid=\"iqiyi1\">\r\n");
        xml.append("                 <extract expression=\"*\" />\r\n");
        xml.append("             </field>\r\n");
        xml.append("         </bean>\r\n");
        xml.append("         <!-- region=\"//hi\"(抽取区域) -->\r\n");
        xml.append("         <tagurl>\r\n");
        xml.append("             <expression>http://www.iqiyi.com/lib/s_\\d+.html</expression>\r\n");
        xml.append("         </tagurl>\r\n");
        xml.append("     </model>\r\n");
        xml.append("     <model>\r\n");
        xml.append("         <bean name=\"iqiyi1\" leaf=\"true\">\r\n");
        xml.append("             <field name=\"name\">\r\n");
        xml.append("                 <extract expression=\"//h1[@itemprop='name']/text()\" />\r\n");
        xml.append("             </field>\r\n");
        xml.append("         </bean>\r\n");
        xml.append("     </model>\r\n");
        xml.append(" </models>\r\n");
        xml.append(" <!-- 任务定时器 -->\r\n");
        xml.append(" <task>39 16 * * *</task>\r\n");
        xml.append("</spider>\r\n");
        return xml.toString();
    }

}
